In [2]:
#Import libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', None)

import plotly.express as px #for visualization
import matplotlib.pyplot as plt #for visualization 

#Read the dataset
data_df = pd.read_csv("churn.csv")

#Get overview of the data
def dataoveriew(df, message):
    print(f'{message}:n')
    print('Number of rows: ', df.shape[0])
    print("nNumber of features:", df.shape[1])
    print("nData Features:")
    print(df.columns.tolist())
    print("nMissing values:", df.isnull().sum().values.sum())
    print("nUnique values:")
    print(df.nunique())

dataoveriew(data_df, 'Overview of the dataset')
Overview of the dataset:n
Number of rows:  7043
nNumber of features: 21
nData Features:
['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']
nMissing values: 0
nUnique values:
customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64
In [3]:
import plotly.express as px

# Count the values of the 'Churn' column
target_instance = data_df["Churn"].value_counts().reset_index()

# Rename the columns for easier reference
target_instance.columns = ['Category', 'Count']

# Plot the pie chart
fig = px.pie(
    target_instance, 
    values='Count', 
    names='Category', 
    color_discrete_sequence=["blue", "pink"], 
    title='Distribution of Churn'
)

# Show the pie chart
fig.show()
In [4]:
#Defining bar chart function
def bar(feature, df=data_df ):
    #Groupby the categorical feature
    temp_df = df.groupby([feature, 'Churn']).size().reset_index()
    temp_df = temp_df.rename(columns={0:'Count'})
    #Calculate the value counts of each distribution and it's corresponding Percentages
    value_counts_df = df[feature].value_counts().to_frame().reset_index()
    categories = [cat[1][0] for cat in value_counts_df.iterrows()]
    #Calculate the value counts of each distribution and it's corresponding Percentages
    num_list = [num[1][1] for num in value_counts_df.iterrows()]
    div_list = [element / sum(num_list) for element in num_list]
    percentage = [round(element * 100,1) for element in div_list]
    #Defining string formatting for graph annotation
    #Numeric section
    def num_format(list_instance):
        formatted_str = ''
        for index,num in enumerate(list_instance):
            if index < len(list_instance)-2:
                formatted_str=formatted_str+f'{num}%, ' #append to empty string(formatted_str)
            elif index == len(list_instance)-2:
                formatted_str=formatted_str+f'{num}% & '
            else:
                formatted_str=formatted_str+f'{num}%'
        return formatted_str
    #Categorical section
    def str_format(list_instance):
        formatted_str = ''
        for index, cat in enumerate(list_instance):
            if index < len(list_instance)-2:
                formatted_str=formatted_str+f'{cat}, '
            elif index == len(list_instance)-2:
                formatted_str=formatted_str+f'{cat} & '
            else:
                formatted_str=formatted_str+f'{cat}'
        return formatted_str
    #Running the formatting functions
    num_str = num_format(percentage)
    cat_str = str_format(categories)

    #Setting graph framework
    fig = px.bar(temp_df, x=feature, y='Count', color='Churn', title=f'Churn rate by {feature}', barmode="group", color_discrete_sequence=["blue", "pink"])
    fig.add_annotation(
                text=f'Value count of distribution of {cat_str} are<br>{num_str} percentage respectively.',
                align='left',
                showarrow=False,
                xref='paper',
                yref='paper',
                x=1.4,
                y=1.3,
                bordercolor='black',
                borderwidth=1)
    fig.update_layout(
        # margin space for the annotations on the right
        margin=dict(r=400),
    )

    return fig.show()
In [5]:
#Gender feature plot
bar('gender')
#SeniorCitizen feature plot 
data_df.loc[data_df.SeniorCitizen==0,'SeniorCitizen'] = "No"   #convert 0 to No in all data instances
data_df.loc[data_df.SeniorCitizen==1,'SeniorCitizen'] = "Yes"  #convert 1 to Yes in all data instances
bar('SeniorCitizen')
#Partner feature plot
bar('Partner')
#Dependents feature plot
bar('Dependents')
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\3807008499.py:4: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'No' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

In [6]:
bar('PhoneService')
bar('MultipleLines')
bar('InternetService')
bar('OnlineSecurity')
bar('OnlineBackup')
bar('DeviceProtection')
bar('TechSupport')
bar('StreamingTV')
bar('StreamingMovies')
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

In [7]:
bar('Contract')
bar('PaperlessBilling')
bar('PaymentMethod')
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

In [8]:
data_df.dtypes
Out[8]:
customerID           object
gender               object
SeniorCitizen        object
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object
In [9]:
# Let’s catch the error
try:
    data_df['TotalCharges'] = data_df['TotalCharges'].astype(float)
except ValueError as ve:
    print (ve)
could not convert string to float: ' '
In [10]:
data_df['TotalCharges'] = pd.to_numeric(data_df['TotalCharges'],errors='coerce')
#Fill the missing values with with the median value
data_df['TotalCharges'] = data_df['TotalCharges'].fillna(data_df['TotalCharges'].median())
In [13]:
# Defining the histogram plotting function
def hist(feature):
    group_df = data_df.groupby([feature, 'Churn']).size().reset_index()
    group_df = group_df.rename(columns={0: 'Count'})
    fig = px.histogram(group_df, x=feature, y='Count', color='Churn', marginal='box', title=f'Churn rate frequency to {feature} distribution', color_discrete_sequence=["blue", "pink"])
    fig.show()
In [14]:
hist('tenure')
hist('MonthlyCharges')
hist('TotalCharges')
In [15]:
#Create an empty dataframe
bin_df = pd.DataFrame()

#Update the binning dataframe
bin_df['tenure_bins'] =  pd.qcut(data_df['tenure'], q=3, labels= ['low', 'medium', 'high'])
bin_df['MonthlyCharges_bins'] =  pd.qcut(data_df['MonthlyCharges'], q=3, labels= ['low', 'medium', 'high'])
bin_df['TotalCharges_bins'] =  pd.qcut(data_df['TotalCharges'], q=3, labels= ['low', 'medium', 'high'])
bin_df['Churn'] = data_df['Churn']

#Plot the bar chart of the binned variables
bar('tenure_bins', bin_df)
bar('MonthlyCharges_bins', bin_df)
bar('TotalCharges_bins', bin_df)
C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:4: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:4: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:4: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:8: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

C:\Users\ASUS\AppData\Local\Temp\ipykernel_3156\805518032.py:10: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

In [16]:
# The customerID column isnt useful as the feature is used for identification of customers. 
data_df.drop(["customerID"],axis=1,inplace = True)

# Encode categorical features

#Defining the map function
def binary_map(feature):
    return feature.map({'Yes':1, 'No':0})

## Encoding target feature
data_df['Churn'] = data_df[['Churn']].apply(binary_map)

# Encoding gender category
data_df['gender'] = data_df['gender'].map({'Male':1, 'Female':0})

#Encoding other binary category
binary_list = ['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
data_df[binary_list] = data_df[binary_list].apply(binary_map)

#Encoding the other categoric features with more than two categories
data_df = pd.get_dummies(data_df, drop_first=True)
In [19]:
# Checking the correlation between features
corr = data_df.corr()

fig = px.imshow(corr,width=1000, height=1000)
fig.show()
In [20]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

#Change variable name separators to '_'
all_columns = [column.replace(" ", "_").replace("(", "_").replace(")", "_").replace("-", "_") for column in data_df.columns]

#Effect the change to the dataframe column names
data_df.columns = all_columns

#Prepare it for the GLM formula
glm_columns = [e for e in all_columns if e not in ['customerID', 'Churn']]
glm_columns = ' + '.join(map(str, glm_columns))

#Fiting it to the Generalized Linear Model
glm_model = smf.glm(formula=f'Churn ~ {glm_columns}', data=data_df, family=sm.families.Binomial())
res = glm_model.fit()
print(res.summary())
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                  Churn   No. Observations:                 7043
Model:                            GLM   Df Residuals:                     7019
Model Family:                Binomial   Df Model:                           23
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -2914.7
Date:                Thu, 15 Aug 2024   Deviance:                       5829.3
Time:                        13:08:44   Pearson chi2:                 8.04e+03
No. Iterations:                     7   Pseudo R-squ. (CS):             0.2807
Covariance Type:            nonrobust                                         
=================================================================================================================
                                                    coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------------------
Intercept                                         0.8274      0.748      1.106      0.269      -0.639       2.294
MultipleLines_No_phone_service[T.True]            0.3238      0.106      3.061      0.002       0.116       0.531
MultipleLines_Yes[T.True]                         0.4469      0.177      2.524      0.012       0.100       0.794
InternetService_Fiber_optic[T.True]               1.7530      0.798      2.198      0.028       0.190       3.316
InternetService_No[T.True]                       -0.2559      0.115     -2.220      0.026      -0.482      -0.030
OnlineSecurity_No_internet_service[T.True]       -0.2559      0.115     -2.220      0.026      -0.482      -0.030
OnlineSecurity_Yes[T.True]                       -0.2055      0.179     -1.150      0.250      -0.556       0.145
OnlineBackup_No_internet_service[T.True]         -0.2559      0.115     -2.220      0.026      -0.482      -0.030
OnlineBackup_Yes[T.True]                          0.0258      0.175      0.147      0.883      -0.318       0.369
DeviceProtection_No_internet_service[T.True]     -0.2559      0.115     -2.220      0.026      -0.482      -0.030
DeviceProtection_Yes[T.True]                      0.1477      0.176      0.838      0.402      -0.198       0.493
TechSupport_No_internet_service[T.True]          -0.2559      0.115     -2.220      0.026      -0.482      -0.030
TechSupport_Yes[T.True]                          -0.1789      0.180     -0.991      0.322      -0.533       0.175
StreamingTV_No_internet_service[T.True]          -0.2559      0.115     -2.220      0.026      -0.482      -0.030
StreamingTV_Yes[T.True]                           0.5912      0.326      1.813      0.070      -0.048       1.230
StreamingMovies_No_internet_service[T.True]      -0.2559      0.115     -2.220      0.026      -0.482      -0.030
StreamingMovies_Yes[T.True]                       0.6038      0.326      1.850      0.064      -0.036       1.244
Contract_One_year[T.True]                        -0.6671      0.107     -6.208      0.000      -0.878      -0.456
Contract_Two_year[T.True]                        -1.3896      0.176     -7.904      0.000      -1.734      -1.045
PaymentMethod_Credit_card__automatic_[T.True]    -0.0865      0.114     -0.758      0.448      -0.310       0.137
PaymentMethod_Electronic_check[T.True]            0.3057      0.094      3.236      0.001       0.121       0.491
PaymentMethod_Mailed_check[T.True]               -0.0567      0.115     -0.493      0.622      -0.282       0.168
gender                                           -0.0219      0.065     -0.338      0.736      -0.149       0.105
SeniorCitizen                                     0.2151      0.085      2.545      0.011       0.049       0.381
Partner                                          -0.0027      0.078     -0.035      0.972      -0.155       0.150
Dependents                                       -0.1538      0.090     -1.714      0.087      -0.330       0.022
tenure                                           -0.0594      0.006     -9.649      0.000      -0.071      -0.047
PhoneService                                      0.5036      0.692      0.728      0.467      -0.852       1.860
PaperlessBilling                                  0.3418      0.074      4.590      0.000       0.196       0.488
MonthlyCharges                                   -0.0404      0.032     -1.272      0.203      -0.103       0.022
TotalCharges                                      0.0003   7.01e-05      4.543      0.000       0.000       0.000
=================================================================================================================
In [21]:
np.exp(res.params)
Out[21]:
Intercept                                        2.287343
MultipleLines_No_phone_service[T.True]           1.382358
MultipleLines_Yes[T.True]                        1.563475
InternetService_Fiber_optic[T.True]              5.771657
InternetService_No[T.True]                       0.774257
OnlineSecurity_No_internet_service[T.True]       0.774257
OnlineSecurity_Yes[T.True]                       0.814269
OnlineBackup_No_internet_service[T.True]         0.774257
OnlineBackup_Yes[T.True]                         1.026127
DeviceProtection_No_internet_service[T.True]     0.774257
DeviceProtection_Yes[T.True]                     1.159152
TechSupport_No_internet_service[T.True]          0.774257
TechSupport_Yes[T.True]                          0.836193
StreamingTV_No_internet_service[T.True]          0.774257
StreamingTV_Yes[T.True]                          1.806134
StreamingMovies_No_internet_service[T.True]      0.774257
StreamingMovies_Yes[T.True]                      1.829067
Contract_One_year[T.True]                        0.513185
Contract_Two_year[T.True]                        0.249179
PaymentMethod_Credit_card__automatic_[T.True]    0.917142
PaymentMethod_Electronic_check[T.True]           1.357617
PaymentMethod_Mailed_check[T.True]               0.944913
gender                                           0.978355
SeniorCitizen                                    1.239957
Partner                                          0.997312
Dependents                                       0.857471
tenure                                           0.942322
PhoneService                                     1.654668
PaperlessBilling                                 1.407543
MonthlyCharges                                   0.960432
TotalCharges                                     1.000318
dtype: float64
In [22]:
#feature scaling
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()
data_df['tenure'] = sc.fit_transform(data_df[['tenure']])
data_df['MonthlyCharges'] = sc.fit_transform(data_df[['MonthlyCharges']])
data_df['TotalCharges'] = sc.fit_transform(data_df[['TotalCharges']])
In [23]:
# Import Machine learning algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

#Import metric for performance evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#Split data into train and test sets
from sklearn.model_selection import train_test_split
X = data_df.drop('Churn', axis=1)
y = data_df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

#Defining the modelling function
def modeling(alg, alg_name, params={}):
    model = alg(**params) #Instantiating the algorithm class and unpacking parameters if any
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    #Performance evaluation
    def print_scores(alg, y_true, y_pred):
        print(alg_name)
        acc_score = accuracy_score(y_true, y_pred)
        print("accuracy: ",acc_score)
        pre_score = precision_score(y_true, y_pred)
        print("precision: ",pre_score)
        rec_score = recall_score(y_true, y_pred)
        print("recall: ",rec_score)
        f_score = f1_score(y_true, y_pred, average='weighted')
        print("f1_score: ",f_score)

    print_scores(alg, y_test, y_pred)
    return model

# Running logistic regression model
log_model = modeling(LogisticRegression, 'Logistic Regression')
Logistic Regression
accuracy:  0.7979176526265973
precision:  0.6274509803921569
recall:  0.5745062836624776
f1_score:  0.7949702200781946
In [24]:
# Feature selection to improve model building
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
log = LogisticRegression()
rfecv = RFECV(estimator=log, cv=StratifiedKFold(10, random_state=50, shuffle=True), scoring="accuracy")
rfecv.fit(X, y)
Out[24]:
RFECV(cv=StratifiedKFold(n_splits=10, random_state=50, shuffle=True),
      estimator=LogisticRegression(), scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RFECV(cv=StratifiedKFold(n_splits=10, random_state=50, shuffle=True),
      estimator=LogisticRegression(), scoring='accuracy')
LogisticRegression()
LogisticRegression()
In [27]:
import matplotlib.pyplot as plt

# Plotting the graph for Recursive Feature Elimination (RFE)
plt.figure(figsize=(8, 6))

# Plot the mean cross-validation score vs. number of features
# Accessing mean test scores from cv_results_
plt.plot(range(1, len(rfecv.cv_results_['mean_test_score']) + 1),
         rfecv.cv_results_['mean_test_score'], marker='o')

# Adding grid for better readability
plt.grid(True)

# Set x-axis ticks to match the number of features
plt.xticks(range(1, X.shape[1] + 1))

# Labeling the axes
plt.xlabel("Number of Selected Features")
plt.ylabel("Cross-Validation Score (CV Score)")

# Adding a title to the plot
plt.title("Recursive Feature Elimination (RFE)")

# Display the plot
plt.show()

# Print the optimal number of features
print("The optimal number of features: {}".format(rfecv.n_features_))
No description has been provided for this image
The optimal number of features: 25
In [29]:
# Saving dataframe with optimal features
X_rfe = X.iloc[:, rfecv.support_]

# Overview of the optimal features in comparison with the initial dataframe
print("X dimension: {}".format(X.shape))
print("X column list:", X.columns.tolist())
print("X_rfe dimension: {}".format(X_rfe.shape))
print("X_rfe column list:", X_rfe.columns.tolist())
X dimension: (7043, 30)
X column list: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'MultipleLines_No_phone_service', 'MultipleLines_Yes', 'InternetService_Fiber_optic', 'InternetService_No', 'OnlineSecurity_No_internet_service', 'OnlineSecurity_Yes', 'OnlineBackup_No_internet_service', 'OnlineBackup_Yes', 'DeviceProtection_No_internet_service', 'DeviceProtection_Yes', 'TechSupport_No_internet_service', 'TechSupport_Yes', 'StreamingTV_No_internet_service', 'StreamingTV_Yes', 'StreamingMovies_No_internet_service', 'StreamingMovies_Yes', 'Contract_One_year', 'Contract_Two_year', 'PaymentMethod_Credit_card__automatic_', 'PaymentMethod_Electronic_check', 'PaymentMethod_Mailed_check']
X_rfe dimension: (7043, 25)
X_rfe column list: ['SeniorCitizen', 'Dependents', 'tenure', 'PhoneService', 'PaperlessBilling', 'TotalCharges', 'MultipleLines_No_phone_service', 'MultipleLines_Yes', 'InternetService_Fiber_optic', 'InternetService_No', 'OnlineSecurity_No_internet_service', 'OnlineSecurity_Yes', 'OnlineBackup_No_internet_service', 'OnlineBackup_Yes', 'DeviceProtection_No_internet_service', 'TechSupport_No_internet_service', 'TechSupport_Yes', 'StreamingTV_No_internet_service', 'StreamingTV_Yes', 'StreamingMovies_No_internet_service', 'StreamingMovies_Yes', 'Contract_One_year', 'Contract_Two_year', 'PaymentMethod_Credit_card__automatic_', 'PaymentMethod_Electronic_check']
In [31]:
# Splitting data with optimal features
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.3, random_state=50)

# Running logistic regression model
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

# Evaluate the model
y_pred = log_model.predict(X_test)

# You can add evaluation metrics here, for example:
from sklearn.metrics import accuracy_score, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.8012304779933743
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.88      0.87      1556
           1       0.64      0.57      0.60       557

    accuracy                           0.80      2113
   macro avg       0.74      0.73      0.74      2113
weighted avg       0.80      0.80      0.80      2113

In [32]:
### Trying other machine learning algorithms: SVC
svc_model = modeling(SVC, 'SVC Classification')
SVC Classification
accuracy:  0.7960246095598675
precision:  0.6431818181818182
recall:  0.5080789946140036
f1_score:  0.7877515790466652
In [33]:
#Random forest
rf_model = modeling(RandomForestClassifier, "Random Forest Classification")
Random Forest Classification
accuracy:  0.7860861334595362
precision:  0.6164079822616408
recall:  0.4991023339317774
f1_score:  0.7783618089789716
In [34]:
#Decision tree
dt_model = modeling(DecisionTreeClassifier, "Decision Tree Classification")
Decision Tree Classification
accuracy:  0.7335541883577852
precision:  0.49491525423728816
recall:  0.5242369838420108
f1_score:  0.7359592535740278
In [35]:
#Naive bayes 
nb_model = modeling(GaussianNB, "Naive Bayes Classification")
Naive Bayes Classification
accuracy:  0.6436346426881212
precision:  0.41792294807370184
recall:  0.895870736086176
f1_score:  0.6625965549469691
In [44]:
## Improve best model by hyperparameter tuning
# define model
model = LogisticRegression()

# define evaluation
from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# define search space
from scipy.stats import loguniform
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = loguniform(1e-5, 1000)

# define search
from sklearn.model_selection import RandomizedSearchCV
search = RandomizedSearchCV(model, space, n_iter=500, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

# execute search
result = search.fit(X_rfe, y)
# summarize result
# print('Best Score: %s' % result.best_score_)
# print('Best Hyperparameters: %s' % result.best_params_)
params = result.best_params_

#Improving the Logistic Regression model
log_model = modeling(LogisticRegression, 'Logistic Regression Classification', params=params)
Logistic Regression Classification
accuracy:  0.8007572172266919
precision:  0.6349206349206349
recall:  0.5745062836624776
f1_score:  0.7974490678620106
C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py:540: FitFailedWarning:


9390 fits failed out of a total of 15000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
364 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'l2', 'elasticnet', 'l1'} or None. Got 'none' instead.

--------------------------------------------------------------------------------
598 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'elasticnet', 'l2', 'l1'} or None. Got 'none' instead.

--------------------------------------------------------------------------------
1661 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'l1', 'l2', 'elasticnet'} or None. Got 'none' instead.

--------------------------------------------------------------------------------
345 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'elasticnet', 'l1', 'l2'} or None. Got 'none' instead.

--------------------------------------------------------------------------------
572 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'l1', 'elasticnet', 'l2'} or None. Got 'none' instead.

--------------------------------------------------------------------------------
1590 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 67, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or None penalties, got elasticnet penalty.

--------------------------------------------------------------------------------
1140 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 67, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or None penalties, got l1 penalty.

--------------------------------------------------------------------------------
960 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 67, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or None penalties, got l1 penalty.

--------------------------------------------------------------------------------
1110 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 75, in _check_solver
    raise ValueError(
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

--------------------------------------------------------------------------------
1050 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 67, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or None penalties, got elasticnet penalty.


C:\Users\ASUS\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_search.py:1102: UserWarning:

One or more of the test scores are non-finite: [       nan        nan        nan        nan 0.79956312 0.80117236
 0.73463008 0.73463008        nan        nan        nan        nan
        nan        nan 0.80443746        nan        nan        nan
 0.73463008        nan 0.80443773 0.80230812        nan 0.73463008
 0.8043427         nan        nan        nan        nan 0.8043431
 0.73463008        nan        nan        nan        nan 0.80443766
        nan        nan 0.78413557        nan 0.80358599 0.80439025
 0.78342602 0.79842716        nan 0.73463008        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.80453229        nan 0.80410656 0.80121964        nan 0.80027254
        nan 0.80443746        nan 0.73463008        nan        nan
        nan        nan        nan 0.80448494        nan 0.80282855
 0.73463008        nan        nan 0.80434303 0.73463008        nan
 0.73463008        nan        nan 0.78455936        nan        nan
 0.73463008        nan        nan 0.8033021         nan        nan
        nan        nan 0.73463008        nan        nan        nan
        nan 0.79378889 0.80334925        nan 0.80273419        nan
 0.73463008 0.80140904        nan        nan 0.80334912        nan
        nan 0.80443746        nan        nan        nan        nan
 0.80443746 0.80420125        nan        nan 0.73463008 0.73463008
        nan        nan 0.79322064        nan 0.73463008 0.80448467
 0.80443766 0.73463008 0.73463008        nan        nan 0.80453229
 0.80448494        nan 0.80315979        nan 0.80439011        nan
 0.73463008        nan        nan        nan 0.73463008        nan
        nan 0.73463008 0.73463008        nan        nan        nan
        nan        nan 0.80453222 0.73463008 0.80405907        nan
        nan        nan        nan 0.80268711        nan        nan
        nan 0.80448474 0.80268684        nan        nan 0.80434303
 0.79757488        nan        nan 0.73463008        nan        nan
 0.80434303        nan 0.73463008        nan        nan 0.73463008
 0.8043427         nan        nan        nan        nan 0.74811802
 0.80358586 0.80344388        nan 0.79672308        nan        nan
        nan        nan        nan 0.73463008        nan 0.73463008
        nan        nan        nan 0.80429588 0.80448467 0.80457964
        nan 0.80453236        nan        nan        nan        nan
 0.80107746        nan        nan        nan 0.73463008        nan
        nan        nan        nan        nan 0.7975276         nan
 0.80358599        nan 0.73463008 0.79880628        nan        nan
 0.80192933 0.73463008        nan        nan        nan        nan
        nan 0.80415384        nan        nan 0.80448474        nan
 0.80368056 0.73463008        nan        nan        nan        nan
 0.80325455        nan        nan 0.73463008        nan 0.79601433
        nan        nan        nan 0.73463008 0.73463008 0.80330197
 0.80448494        nan 0.80306496 0.73463008        nan 0.80121958
        nan        nan 0.73463008        nan        nan        nan
        nan        nan 0.80297107 0.802971   0.80405928        nan
        nan        nan        nan 0.80439038        nan 0.73463008
        nan        nan 0.7872593  0.75971544        nan        nan
        nan        nan        nan 0.80316033 0.79667614        nan
        nan 0.80306529        nan        nan        nan 0.73463008
        nan        nan 0.80453243        nan        nan 0.73463008
        nan 0.80301835        nan 0.80443753        nan 0.80420125
        nan        nan 0.80443746        nan        nan 0.78451221
        nan 0.80372784 0.73463008        nan 0.80420125 0.80325435
        nan 0.73463008        nan        nan        nan        nan
        nan 0.80429575        nan        nan        nan 0.73463008
 0.80457964        nan        nan        nan 0.80462692        nan
 0.80358593        nan        nan        nan 0.73463008 0.73463008
        nan 0.8042486         nan 0.73463008 0.73463008 0.80443746
        nan        nan 0.73463008 0.79984707        nan 0.73463008
        nan 0.80292318 0.79236783        nan        nan 0.80339633
        nan 0.8011725  0.80268698        nan 0.80415397        nan
        nan        nan        nan        nan        nan 0.80306503
        nan 0.80088834        nan        nan 0.80448494        nan
        nan        nan        nan 0.80448474        nan        nan
 0.80339667 0.77504708        nan        nan        nan        nan
 0.73463008 0.73463008 0.80434303 0.73463008        nan        nan
 0.73463008 0.73463008 0.79113986        nan        nan        nan
        nan 0.73463008 0.80306543        nan 0.73463008 0.73463008
        nan        nan 0.80448481        nan 0.7885371         nan
        nan 0.80282902        nan        nan        nan        nan
 0.73463008        nan        nan        nan        nan        nan
        nan        nan 0.79937386 0.78337605        nan 0.80443746
 0.73463008        nan        nan        nan 0.73524547        nan
        nan        nan 0.73463008        nan        nan 0.76245856
        nan        nan 0.80457964        nan        nan        nan
        nan        nan        nan        nan        nan 0.8044378
        nan        nan        nan        nan        nan        nan
        nan 0.80358572 0.80301774        nan 0.79762217        nan
        nan        nan        nan 0.80434303        nan        nan
        nan        nan 0.80448481        nan        nan        nan
 0.80278181        nan 0.80330136        nan 0.80462692        nan
        nan        nan]

In [40]:
#Saving best model 
import joblib
#Sava the model to disk
filename = 'model.sav'
joblib.dump(log_model, filename)
Out[40]:
['model.sav']
In [ ]: